## Gender Stereotypes in Digital Science Communication Project
## JCMC special issue, 2023


# Step 1: Install the Structural Topic Model Package
install.packages("stm")
library("stm") #version: ‘1.3.6.1’
library(tm)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)

# Set up your own working directory 
setwd(".././") #Specify the working directory here which should be the folder on your computer where R will look for and save files. setwd("/path/to/your/directory")

# Import the dataset this STM article used
data <- read.csv("../moral scored_final_dataset_march_10_2023_FemaleMaleOnly.csv") #1,756 observations, 7 columns
colnames(data) 

#######################################################
######## Pre-processing starts here here ##############
#######################################################
data$raw_video_description <- data$video_description

data$video_description <- as.character(data$video_description)
data$video_description <- tolower(data$video_description)
data$video_description

#Substitute
data$video_description = gsub("i'm", "i am", data$video_description)
data$video_description = gsub("you're", "you are", data$video_description)
data$video_description = gsub("he's", "he is", data$video_description)
data$video_description = gsub("she's", "she is", data$video_description)
data$video_description = gsub("they're", "they are", data$video_description)
data$video_description = gsub("we're", "we are", data$video_description)
data$video_description = gsub("i'll", "i will", data$video_description)
data$video_description = gsub("you'll", "you will", data$video_description)
data$video_description = gsub("he'll", "he will", data$video_description)
data$video_description = gsub("she'll", "she will", data$video_description)
data$video_description = gsub("they'll", "they will", data$video_description)
data$video_description = gsub("we'll", "we will", data$video_description)
data$video_description = gsub("don't", "do not", data$video_description)
data$video_description = gsub("doesn't", "does not", data$video_description)
data$video_description = gsub("didn't", "did not", data$video_description)
data$video_description = gsub(" w ", " with ", data$video_description)
data$video_description = gsub(" w/o ", " without ", data$video_description)
data$video_description = gsub(" that's ", " that is ", data$video_description)
data$video_description = gsub(" what's ", " what is ", data$video_description)
data$video_description = gsub(" won't ", " will not ", data$video_description)
data$video_description = gsub(" dems ", " democrats ", data$video_description)
data$video_description = gsub(" ppl ", " people ", data$video_description)
data$video_description = gsub(" pic ", " picture ", data$video_description)
data$video_description = gsub(" pres ", " president ", data$video_description)
data$video_description = gsub(" u ", " you ", data$video_description)
data$video_description = gsub(" ur ", " your ", data$video_description)
data$video_description = gsub(" it's ", " it is ", data$video_description)
data$video_description = gsub(" heard ", " hear ", data$video_description)
data$video_description = gsub(" gov ", " government ", data$video_description)
data$video_description = gsub(" govern ", " government ", data$video_description)
data$video_description = gsub(" govt ", " government ", data$video_description)
data$video_description = gsub("pic.twitter.*", "", data$video_description)
data$video_description <- gsub("(http|https)://[[:alnum:]/\\.]+", "", data$video_description)

#replace all punctuation characters with a space character
data$video_description = gsub("[[:punct:]]+", " ", data$video_description)
#replace all digit numbers with an empty string
data$video_description = gsub("\\d+", "", data$video_description)

#remove leading and trailing white spaces
data$video_description = str_trim(data$video_description)
data$video_description = str_squish(data$video_description)

#remove stop words
stop <- readLines("stopwords-whole.txt", encoding = "UTF-8")
data$video_description  =  removeWords(data$video_description,stop)

#######################################################
############### Pre-processing ends here ##############
#######################################################


# Step 2: Preparation for Text Analysis
processed <- textProcessor(data$video_description, metadata = data)
  # check which documents in data are deleted in the processed step
processed$docs.removed # integer(0) means that no document is deleted in this case.
  # if there are documents deleted in the above step are you check, then you need to run the below code line to refine your original data
data <- data[-processed$docs.removed,] #. This code is to refine the dataset when there are documents removed. If you do not run this line, you will run into error when you later run the function findThoughts()

out <- prepDocuments(processed$documents, processed$vocab, processed$meta) 
  # check which documents are removed in the above prepDocuments step
out$docs.removed # NULL means no documents are removed
  # if there are documents removed, you need to run the below code line to refine the raw dataset
data <- data[-out$docs.removed,] # This code is to again refine the dataset when there are documents removed

docs <- out$documents
vocab <- out$vocab
meta <- out$meta

#====== Search k ======
#tnum <- searchK(out$documents, out$vocab, K = c(10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80),
                #max.em.its = 100, 
                #prevalence = ~gender + platform, data = out$meta)
#plot(tnum) # figure saved and named as "Diagnostic Values by Number of Topics.png"
# These diagnostics indicate that a good number of topics would be around: xx


# Step 3: Run the function stm() to let the computer generate the most prevalent 10 Topics for you. 
  # The below code took around 3 minutes to run on my computer
PrevFit <- stm(documents = out$documents, vocab = out$vocab,
                       K = 20, prevalence =~gender + platform, max.em.its = 1000,
                       data = out$meta, init.type = "Spectral", seed=123456)

# Plot the keywords for the above 10 most prevalent topics
plot(PrevFit, type = "summary", xlim = c(0, 0.7), labeltype = "frex", n = 10)
  # You can click "Zoom" in the Plots box on the bottom right of your RStudio to see a larger version of the figure

# Inspect the keywords for each topic (FREX is the most useful one to look at to understand what each topic might suggest)
labelTopics(PrevFit)
topic <- labelTopics(PrevFit,c(1:20))


# To examine the example blog posts that are highly associated with topics the findThoughts function can be used
  # n=3 means I want to look at 3 example blog posts. topics=1 means I want to look at topic 1. You can adjust these two parameters
thoughts <- findThoughts(PrevFit, texts = data$raw_video_description, n = 5, topics = c(1))$docs[[1]]
thoughts # after reading these example posts, I think topic 1 suggests that this topic is talking about hillary's performance during the primary election
  #tip: if you feel it is hard to read these examples in the small Console window, copy paste to a word document for ease of reading!


# Step 4: Estimate the relationship between topic and meta data
out$meta$rating <- as.factor(out$meta$gender)
levels(out$meta$gender) # check what categories our variable gender has: [1] "Conservative" "Liberal". 
  #Note: The first category you see (in this case conservative) should be the value you put into the plot function for cov.value2

prep <- estimateEffect(1:20 ~gender + platform, PrevFit, meta = out$meta, uncertainty = "Global")
summary(prep)


# new visualize

# obtain topic-related statistics
theta <- data.frame(PrevFit$theta) 
theta 
  # theta = proportion of each topic for each post
mean(theta$X20) #Calculate prevelance of a topic 




t <- rep("NA",20)
for (i in 1:20){
  t[i] <- mean(theta[,i])
}
Prob <- rep("NA",20)
Frex <- rep("NA",20)
Lift <- rep("NA",20)
Score <- rep("NA",20)
for (i in 1:20){
  Prob[i] <- paste(topic$prob[i,1],topic$prob[i,2],topic$prob[i,3],topic$prob[i,4],topic$prob[i,5],topic$prob[i,6],topic$prob[i,7])
  Frex[i] <- paste(topic$frex[i,1],topic$frex[i,2],topic$frex[i,3],topic$frex[i,4],topic$frex[i,5],topic$frex[i,6],topic$frex[i,7])
  Lift[i] <- paste(topic$lift[i,1],topic$lift[i,2],topic$lift[i,3],topic$lift[i,4],topic$lift[i,5],topic$lift[i,6],topic$lift[i,7])
  Score[i] <- paste(topic$score[i,1],topic$score[i,2],topic$score[i,3],topic$score[i,4],topic$score[i,5],topic$score[i,6],topic$score[i,7])
}

# visualize
h <- plot(prep,covariate = "gender",topics=c(1:20),model=PrevFit,method="difference",cov.value1="female",cov.value2="male",xlim=c(-0.1,0.1),main="Effect of using female vs. male cues ",xlab = "Using male cues  ....... using female cues")

# generate the estimated effects for each topic: how likely is each topic to be reported upward
mean <- h$means
CI <- h$cis
m <- rep("NA",20)
down <- rep("NA",20)
upper <- rep("NA",20)
for (i in 1:20){
  m[i] <- mean[[i]]
  down[i] <- CI[[i]][1]
  upper[i] <- CI[[i]][2]
}
EF <- data.frame(m,down,upper,Prob,Frex,Lift,Score,t)

EF$m <- as.numeric(as.character(EF$m))
EF$down <- as.numeric(as.character(EF$down))
EF$upper <- as.numeric(as.character(EF$upper))

EF$ID <- c(1,2,3,4,5,6,7,8,9,10,
           11,12,13,14,15,16,17,18,19,20
           )

EF$Title <- c("Children vaccination/Baby products", #T1
              "Misinformation and climate change denial", #T2
              "Online discussions on climate change", #T3
              "Gardening products", #T4
              "Music and songs", #T5
              "Weather modification and the chemtrail conspiracy theory", #T6
              "Cannot label", #T7
              "Video classes on global warming", #T8
              "Meteorology science", #T9
              "Jimmy Kimmel's claims on climate change", #T10
              "Cannot label", #T11
              "News channels and services", #T12
              "Online class training program ads", #T13
              "News reports on climate change", #T14
              "Vaccination for children and women", #T15
              "Side effects and vaccination safety for children and pregnant women", #T16
              "Scientific research on climate issues", #T17
              "Accuracy of climate change predictions and trust in climate scientists", #T18
              "Greenhouse", #T19
              "Activists and actions on climate change" #T20
)


EF$Title <- as.character(EF$Title)


# plot labeled topics
d <- EF[which(EF$Title!="Cannot label"),]
d <- d[order(d$m),]
d$Title <- factor(d$Title, levels=unique(d$Title))
ggplot(d, aes(x=d$Title, y=d$m, ymin=d$down, ymax=d$upper))+
  geom_pointrange(size = 0.5) +
  geom_hline(yintercept = 0, linetype=2, size = 1) +
  coord_flip() +
  ylim(-0.07, 0.07) +
  xlab("") + ylab("Difference in Topic Proportion") +
  annotate("text", y=-0.050, x=8.5, label= "Using\n men-related cues",size =6.5, fontface = "bold") +
  annotate("text", y=0.050, x=8.5, label= "Using\n women-related cues", size =6.5, fontface = "bold") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(axis.title.x = element_text(size=16) ,
        axis.text.x  = element_text(size=16),axis.text.y = element_text(size=20))


# correlation between prevalence and upward reporting
cor(as.numeric(paste(EF$t)),as.numeric(paste(EF$m))) #0.1182979

